

https://upload.wikimedia.org/wikipedia/commons/1/10/JIE_Sankey_V5_Fig1.png

https://upload.wikimedia.org/wikipedia/commons/2/29/Minard.png

https://www.iea.org/sankey/#?c=World&s=Balance

https://ec.europa.eu/eurostat/web/energy/energy-flow-diagrams

https://www.tagesschau.de/inland/btw21/waehlerwanderung-bundestagswahl-103.html

https://www.tagesschau.de/inland/btw21/waehlerwanderung-bundestagswahl-103.html

https://www.tagesschau.de/inland/btw21/waehlerwanderung-bundestagswahl-103.html

https://www.economist.com/graphic-detail/2019/11/01/a-british-election-and-other-uncertainties

https://download.statistik-berlin-brandenburg.de/0c8e82331bc2327a/802f7f020114/SB_A01-03-00_2020j01_BE.xlsx
# Population balance for 2020.
# Sign convention: inflows are positive, outflows are negative.
data = {
    '2020': {
        # inputs
        'start': 3669491,  # census at start of the year
        'births': 38693,
        'immigration': 142923,
        # outputs
        'deaths': -37642,
        'emmigration': -144881,
        'error': -4496,
        'end': -3664088,  # census at the end of the year
    },
}
# Split the (label, flow) pairs into two parallel lists.
labels, flows = (list(column) for column in zip(*data['2020'].items()))
flows, labels
([3669491, 38693, 142923, -37642, -144881, -4496, -3664088], ['start', 'births', 'immigration', 'deaths', 'emmigration', 'error', 'end'])
import matplotlib.pyplot as plt
from matplotlib.sankey import Sankey
# Minimal diagram: default settings, flows and labels only.
sankey = Sankey()
# add() hands back the Sankey instance, so finish() can be chained.
sankey.add(labels=labels, flows=flows).finish()
plt.show()

# The flows are large numbers, so pass a small scale factor.
scale = 1e-7
sankey = Sankey(scale=scale)
sankey.add(labels=labels, flows=flows).finish()
plt.show()

# One orientation per flow:
#    0 -> inputs from the left, outputs to the right
#    1 -> from and to the top
#   -1 -> from and to the bottom
orientations = [0, -1, 1, -1, 1, -1, 0]
sankey = Sankey(scale=scale)
# add flow(s) with orientations
sankey.add(orientations=orientations, labels=labels, flows=flows).finish()
plt.show()

# One path length per flow; only the sixth flow gets a longer path.
pathlengths = [0.1] * 5 + [0.3, 0.1]
sankey = Sankey(scale=scale)
# add flow(s) with orientations and pathlengths
sankey.add(
    pathlengths=pathlengths,
    orientations=orientations,
    labels=labels,
    flows=flows,
).finish()
plt.show()

def format_number(n):
    """Return the magnitude of ``n`` as text with thousands separators.

    The sign is dropped with ``abs`` so that negative (outgoing) flows are
    shown with their positive size, e.g. ``-3664088`` -> ``'3,664,088'``.
    """
    return f'{abs(n):,}'  # add thousands separator
# Same diagram, with the flow values rendered through format_number.
sankey = Sankey(format=format_number, scale=scale)
sankey.add(
    pathlengths=pathlengths,
    orientations=orientations,
    labels=labels,
    flows=flows,
).finish()
plt.show()

# Final single-year diagram: fill color and a title on top.
sankey = Sankey(format=format_number, scale=scale)
sankey.add(
    facecolor='lightgray',  # color
    pathlengths=pathlengths,
    orientations=orientations,
    labels=labels,
    flows=flows,
).finish()
plt.title("Berlin Census 2020")  # add title
plt.show()

# add second year: same balance layout, one entry per year
data = {
    '2019': {
        'start 2019': 3644826,
        'births': 39503,
        'immigration': 184744,
        'deaths': -34739,
        'emmigration': -161513,
        'error': -3330,
        'end 2019': -3669491,
    },
    '2020': {
        'start 2020': 3669491,
        'births': 38693,
        'immigration': 142923,
        'deaths': -37642,
        'emmigration': -144881,
        'error': -4496,
        'end 2020': -3664088,
    },
}
# 2019: keep every flow, but blank out the label of the last one
flows_2019 = [*data['2019'].values()]
labels_2019 = [*data['2019']][:-1] + [None]
# 2020: flows and labels as they are
flows_2020 = [*data['2020'].values()]
labels_2020 = [*data['2020']]
# new pathlengths
pathlengths = [0.3, 0.3, 0.1, 0.1, 0.3, 0.5, 0.3]
# Two chained diagrams in one figure: 2019 first, then 2020 attached to it.
sankey = Sankey(format=format_number, scale=scale)  # init
# add 2019
sankey.add(
    facecolor='lightgray',
    pathlengths=pathlengths,
    orientations=orientations,
    labels=labels_2019,
    flows=flows_2019,
)
# add 2020
sankey.add(
    facecolor='darkgray',
    pathlengths=pathlengths,
    orientations=orientations,
    labels=labels_2020,
    flows=flows_2020,
    # connect second flow to first: the last flow of diagram 0 (2019)
    # is joined to flow 0 of this diagram
    prior=0,
    connect=(len(flows_2019) - 1, 0),
)
sankey.finish()
plt.title("Berlin Census 2019 & 2020")  # add title
plt.show()
- scale argument for large/small numbers
- format difficult to handle
import pandas as pd
from pySankey.sankey import sankey
# create DataFrame from 2020 data: one row per link (source -> target)
# NOTE(review): 'start' -> 'end' carries the full start-of-year census, so
# the links leaving 'start' add up to more than 3,669,491 - confirm that
# this simplification is intended.
df_2020 = pd.DataFrame(
    [
        ('start', 'deaths', 37642),
        ('start', 'emmigration', 144881),
        ('start', 'error', 4496),
        ('start', 'end', 3669491),
        ('births', 'end', 38693),
        ('immigration', 'end', 142923),
    ],
    columns=['source', 'target', 'value'],
)
df_2020
| | source | target | value |
|---|---|---|---|
| 0 | start | deaths | 37642 |
| 1 | start | emmigration | 144881 |
| 2 | start | error | 4496 |
| 3 | start | end | 3669491 |
| 4 | births | end | 38693 |
| 5 | immigration | end | 142923 |
# figure_name="Berlin Census 2020" is left out on purpose:
# it is used for saving png, not title...
sankey(
    fontsize=14,
    left=df_2020['source'],
    right=df_2020['target'],
    leftWeight=df_2020['value'],
)
- pd.DataFrame
- figure_name not in docstring: used for saving file (not title)
from psankey.sankey import sankey
# Draw the 2020 links with node and link labels switched on.
nodes, fig, ax = sankey(
    df_2020,
    labelsize=5,
    linklabels=True,
    nodelabels=True,
    aspect_ratio=4/3,
)
plt.title("Berlin Census 2020")  # add title
plt.show()
# create DataFrame from 2019 & 2020 data: one row per link (source -> target)
df = pd.DataFrame(
    [
        # 2019
        ('2019', '2020', 3644826),
        ('2019', 'deaths `19', 34739),
        ('2019', 'emmigration `19', 161513),
        ('2019', 'error `19', 3330),
        ('births `19', '2020', 39503),
        ('immigration `19', '2020', 184744),
        # 2020
        ('2020', '2021', 3669491),
        ('2020', 'deaths `20', 37642),
        ('2020', 'emmigration `20', 144881),
        ('2020', 'error `20', 4496),
        ('births `20', '2021', 38693),
        ('immigration `20', '2021', 142923),
    ],
    columns=['source', 'target', 'value'],
)
df.head(3)
| | source | target | value |
|---|---|---|---|
| 0 | 2019 | 2020 | 3644826 |
| 1 | 2019 | deaths `19 | 34739 |
| 2 | 2019 | emmigration `19 | 161513 |
# Draw the two-year links with node and link labels switched on.
nodes, fig, ax = sankey(
    df,
    labelsize=5,
    linklabels=True,
    nodelabels=True,
    aspect_ratio=4/3,
)
plt.title("Berlin Census 2019 & 2020")  # add title
plt.show()
- pd.DataFrames
- README.md
- nodemodifier to highlight nodes
import holoviews as hv
from holoviews import opts, dim
hv.extension('bokeh')  # load the bokeh extension
# plot size shared by the following diagrams
width = 600
height = 400
# run example code: each edge is [source, target, value]
sankey = hv.Sankey([
    ['A', 'X', 5],
    ['A', 'Y', 7],
    ['A', 'Z', 6],
    ['B', 'X', 2],
    ['B', 'Y', 9],
    ['B', 'Z', 4],
])
sankey.opts(height=height, width=width)
# remember the 2020 data? (first five rows)
df_2020.head(n=5)
| | source | target | value |
|---|---|---|---|
| 0 | start | deaths | 37642 |
| 1 | start | emmigration | 144881 |
| 2 | start | error | 4496 |
| 3 | start | end | 3669491 |
| 4 | births | end | 38693 |
# pass DataFrame from previous example straight to holoviews
sankey = hv.Sankey(df_2020)
sankey.opts(height=height, width=width)
# let's get two steps (first five rows of the two-year data)
df.head(n=5)
| | source | target | value |
|---|---|---|---|
| 0 | 2019 | 2020 | 3644826 |
| 1 | 2019 | deaths `19 | 34739 |
| 2 | 2019 | emmigration `19 | 161513 |
| 3 | 2019 | error `19 | 3330 |
| 4 | births `19 | 2020 | 39503 |
# Two-year diagram; edges take their color from 'source', nodes from 'target'.
sankey = hv.Sankey(df)
sankey.opts(
    node_color=dim('target').str(),
    edge_color=dim('source').str(),
    cmap='Set2',
    height=height,
    width=width,
)
- cmap
import plotly.graph_objects as go
# example from https://plotly.com/python/sankey-diagram/
fig = go.Figure(
    data=[
        go.Sankey(
            # define nodes
            node={
                "pad": 15,  # padding
                "thickness": 20,
                "line": {"color": "black", "width": 0.5},
                "label": ["A1", "A2", "B1", "B2", "C1", "C2"],
                "color": "blue",
            },
            # define flows
            link={
                # indices correspond to labels
                "source": [0, 1, 0, 2, 3, 3],
                "target": [2, 3, 3, 4, 4, 5],
                "value": [8, 4, 2, 8, 4, 2],
            },
        )
    ]
)
fig.update_layout(
    title_text="Basic Sankey Diagram",
    font_size=10,
    width=width,
    height=height,
)
fig.show()
# create DataFrame from 2019 & 2020 data and color
# column names
s, t, v, c = 'source', 'target', 'value', 'color'
# palette
light_green, dark_green = '#b2df8a', '#33a02c'
light_blue, dark_blue = "#a6cee3", "#1f78b4"
light_pink = '#f1b6da'
# one row per link: (source, target, value, link color)
df = pd.DataFrame(
    [
        # 2019
        ('2019', '2020', 3644826, 'lightgray'),
        ('2019', 'deaths `19', 34739, light_blue),
        ('2019', 'emmigration `19', 161513, dark_blue),
        ('2019', 'error `19', 3330, light_pink),
        ('births `19', '2020', 39503, light_green),
        ('immigration `19', '2020', 184744, dark_green),
        # 2020
        ('2020', '2021', 3669491, 'lightgray'),
        ('2020', 'deaths `20', 37642, light_blue),
        ('2020', 'emmigration `20', 144881, dark_blue),
        ('2020', 'error `20', 4496, light_pink),
        ('births `20', '2021', 38693, light_green),
        ('immigration `20', '2021', 142923, dark_green),
    ],
    columns=[s, t, v, c],
)
df.head(3)
| | source | target | value | color |
|---|---|---|---|---|
| 0 | 2019 | 2020 | 3644826 | lightgray |
| 1 | 2019 | deaths `19 | 34739 | #a6cee3 |
| 2 | 2019 | emmigration `19 | 161513 | #1f78b4 |
# create nodes with index from DataFrame
# (approach taken from https://stackoverflow.com/a/69464558)
import numpy as np
# every distinct name that appears as a source or as a target
nodes = np.unique(df[["source", "target"]], axis=None)
# lookup table: node name -> running number
nodes = pd.Series(range(len(nodes)), index=nodes)
nodes
2019                0
2020                1
2021                2
births `19          3
births `20          4
deaths `19          5
deaths `20          6
emmigration `19     7
emmigration `20     8
error `19           9
error `20          10
immigration `19    11
immigration `20    12
dtype: int64
# Sankey diagram of both years, with the node numbers looked up in `nodes`.
fig = go.Figure(
    data=[
        go.Sankey(
            node={
                "label": nodes.index,
            },
            link={
                # look up the number of each source/target name
                "source": nodes.loc[df["source"]],
                "target": nodes.loc[df["target"]],
                "value": df["value"],
            },
        )
    ]
)
fig.update_layout(
    title_text="Berlin Census 2019 & 2020",
    width=width, height=height, font_size=10)
fig.show()
# create x, y and colors for the nodes
# One (x, y, color) row per node, in the order of the `nodes` index.
_layout = [
    # years
    (.1, .5, "darkgray"),
    (.4, .5, "darkgray"),
    (.7, .5, "darkgray"),
    # births
    (.1, .75, light_green),
    (.4, .8, light_green),
    # deaths
    (.3, .2, light_blue),
    (.6, .25, light_blue),
    # emmigration
    (.3, .25, dark_blue),
    (.6, .3, dark_blue),
    # error
    (.3, .15, light_pink),
    (.6, .2, light_pink),
    # immigration
    (.1, .7, dark_green),
    (.4, .75, dark_green),
]
# transpose the rows into three parallel lists
x, y, color = (list(column) for column in zip(*_layout))
x, y
([0.1, 0.4, 0.7, 0.1, 0.4, 0.3, 0.6, 0.3, 0.6, 0.3, 0.6, 0.1, 0.4], [0.5, 0.5, 0.5, 0.75, 0.8, 0.2, 0.25, 0.25, 0.3, 0.15, 0.2, 0.7, 0.75])
# Same diagram with explicit node positions, node colors and link colors.
fig = go.Figure(
    data=[
        go.Sankey(
            arrangement="freeform",
            node={
                "label": nodes.index,
                "x": x,
                "y": y,
                "pad": 100,  # padding between nodes
                "color": color,
            },
            link={
                # look up the number of each source/target name
                "source": nodes.loc[df["source"]],
                "target": nodes.loc[df["target"]],
                "value": df["value"],
                "color": df["color"],
            },
        )
    ]
)
fig.update_layout(
    title_text="Berlin Census 2019 & 2020", font_size=10
)
fig.show()

https://www.ipoint-systems.com/blog/from-data-to-knowledge-the-power-of-elegant-sankey-diagrams/
https://github.com/dringler/talks/tree/master/pydata/berlin2022


All GIFs from https://giphy.com/